This document is mainly for part3 of the GEO data set analysis, that is the quality control of data sets from different tissue types.
I did PCA on control porbes, so we can see whether there are bach effects between different studies, but within same type of tissue.
library(tidyverse)
library(minfi)
library(magrittr)
library(tibble)
library(data.table)
library(limma)
library(ewastools)
library(FactoMineR)
library(doParallel)
library(ggplot2)
library(RColorBrewer)
pal <- brewer.pal(8, "Dark2")
# GEOmeta, GEO_phenotypes
load(file = "/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_phenotypes_add10.RData")
#GEO_RGset_450k_normalPlacenta, GEO_phenoData_450k_normalPlacenta,
#GEO_RGset_EPIC_normalPlacenta, GEO_phenoData_EPIC_normalPlacenta,
#GEO_RGset_EPIC_normalPlacenta_ourStudy, GEO_phenoData_EPIC_normalPlacenta_ourStudy
load(file ="/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_RG&pData_normalPlacenta.RData")
#GEO_RGset_450k_Amnion, GEO_phenoData_450k_Amnion, GEO_RGset_EPIC_Amnion, GEO_phenoData_EPIC_Amnion
load(file ="/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_RG&pData_normalAmnion.RData")
#GEO_RGset_450k_Chorion, GEO_phenoData_450k_Chorion,GEO_RGset_EPIC_Chorion, GEO_phenoData_EPIC_Chorion
load(file ="/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_RG&pData_normalChorion.RData")
#GEO_RGset_450k_Decidua, GEO_phenoData_450k_Decidua, GEO_RGset_EPIC_Decidua, GEO_phenoData_EPIC_Decidua,
load(file ="/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_RG&pData_normalDecidua.RData")
#GEO_RGset_450k_MaternalBlood, GEO_phenoData_450k_MaternalBlood,
load(file ="/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_RG&pData_normalMaternalBlood.RData")
#GEO_RGset_450k_mesenchyme, GEO_phenoData_450k_mesenchyme
load(file ="/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_RG&pData_normalPlacentalMesenchyme.RData")
#GEO_RGset_450k_trophoblast, GEO_phenoData_450k_trophoblast
load(file ="/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_RG&pData_normalPlacentalTrophoblast.RData")
#GEO_RGset_450k_UC, GEO_phenoData_450k_UC
load(file ="/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_RG&pData_normalUmbilicalCord.RData")
#GEO_RGset_450k_UCB, GEO_phenoData_450k_UCB
load(file ="/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_RG&pData_normalUmbilicalCordBlood.RData")
# let argument RGset to be a charactor
# argument `group`, group could be `trimester`, `fetalSex` `gestationalAge` or `Study`
ControlPCA <- function(RGsetChara, group){
registerDoParallel(cores = 10)
# get the object from character
RGset <- get(RGsetChara)
# get meta data
phenoData <- pData(RGset)
# get intensity of control probes:
ctrls <- getProbeInfo(RGset, type = "Control")
M.Neg <- getGreen(RGset)[ctrls$Address[ctrls$Type == "NEGATIVE"], , drop=FALSE]
U.Neg <- getRed(RGset)[ctrls$Address[ctrls$Type == "NEGATIVE"], , drop=FALSE]
# calculate beta values and M values
beta_ctrls <- M.Neg/(M.Neg+U.Neg+100) %>% as.matrix()
M_ctrls <- log2(beta_ctrls/(1 - beta_ctrls))
M_ctrls_v1 <- M_ctrls[apply(M_ctrls, 1, function(x) all(is.finite(x))), , drop=FALSE]
# remove infinite values and draw PCA plots for data sets with mutiple samples
if(ncol(M_ctrls)==1){
print("no PCA plot, since only 1 sample for this data set")
}else if(ncol(M_ctrls)==2){
print("no PCA plot, since only 2 samples for this data set")
}else{
## PCA plot for control M values
PCA_M_ctrl <- FactoMineR::PCA(base::t(M_ctrls_v1))
## relabel PCA_M_ctrl plot using ggplot:
PCA_M_ctrl_plot.df <- data.frame(x = PCA_M_ctrl$ind$coord[,1], y = PCA_M_ctrl$ind$coord[,2],
fetalSex=phenoData$Fetal_Sex,
gestationalAge=phenoData$Gestation,
trimester=phenoData$Trimester,
Study= phenoData$Study)
PCA_M_ctrl_plot <-
PCA_M_ctrl_plot.df %>%
ggplot(aes(x = x, y = y)) +
## add argument `group`, group could be `trimester`, `fetalSex` `gestationalAge` or `Study`
geom_point(aes(col = get(group)), alpha = 1, size = 7)+ #color = factor(Batch),
scale_colour_hue()+
labs(x="PCA1", y="PCA2", colour=group)+
ggtitle(RGsetChara)+
theme(text = element_text(size=20))
print(PCA_M_ctrl_plot)
} # end if
# values to return
ControlBMList <- list(ControlB=beta_ctrls, ControlM=M_ctrls_v1)
return(ControlBMList)
# saveRDS(PCA_M_ctrl, file = "/home/qianhui/DNAme/Process_decidual/RDSfiles/PCA_M_ctrl.rds")
}
ControlPCA for each tissue types: 450K & EPIC (data from GEO data base)# !(GEO_phenotypes$Sample_name%in%c("GSM1617002_7668610068_R01C01","GSM1616993_7668610068_R06C02"))
GEO_RGset_450k_normalPlacenta <- GEO_RGset_450k_normalPlacenta[,-5]
GEO_RGset_450k_MaternalBlood <- GEO_RGset_450k_MaternalBlood[,-8]
TissueRGsetNames <- c(
"GEO_RGset_450k_normalPlacenta", "GEO_RGset_EPIC_normalPlacenta",
"GEO_RGset_EPIC_normalPlacenta_ourStudy",
"GEO_RGset_450k_Amnion", "GEO_RGset_EPIC_Amnion",
"GEO_RGset_450k_Chorion", "GEO_RGset_EPIC_Chorion",
"GEO_RGset_450k_Decidua", "GEO_RGset_EPIC_Decidua",
"GEO_RGset_450k_MaternalBlood", "GEO_RGset_450k_mesenchyme",
"GEO_RGset_450k_trophoblast",
"GEO_RGset_450k_UC", "GEO_RGset_450k_UCB"
)
for(i in TissueRGsetNames){
assign(paste0("ControlBM", str_replace_all(i, "GEO_RGset", "")),
ControlPCA(RGsetChara = i, group = "Study"))
}
## Loading required package: IlluminaHumanMethylation450kmanifest
## Loading required package: IlluminaHumanMethylationEPICmanifest
## [1] "no PCA plot, since only 1 sample for this data set"
## [1] "no PCA plot, since only 2 samples for this data set"
## [1] "no PCA plot, since only 1 sample for this data set"
ControlBM_450k_normalPlacenta_plotByTrimester <- ControlPCA(RGsetChara = "GEO_RGset_450k_normalPlacenta", group = "trimester")
save(list=c(paste0("ControlBM", str_replace_all(i, "GEO_RGset", ""))),
file = "/home/qianhui/DNAme/Process_decidual/RDSfiles/ControlBMof9TissueTypes.RData")
JoinAndPlotAllControlControlBMvalInfo <- tibble(
TissueRGsetNames = c("GEO_RGset_450k_normalPlacenta", "GEO_RGset_EPIC_normalPlacenta",
"GEO_RGset_EPIC_normalPlacenta_ourStudy",
"GEO_RGset_450k_Amnion", "GEO_RGset_EPIC_Amnion",
"GEO_RGset_450k_Chorion", "GEO_RGset_EPIC_Chorion",
"GEO_RGset_450k_Decidua", "GEO_RGset_EPIC_Decidua",
"GEO_RGset_450k_MaternalBlood", "GEO_RGset_450k_mesenchyme",
"GEO_RGset_450k_trophoblast",
"GEO_RGset_450k_UC", "GEO_RGset_450k_UCB"
),
ControlBMvalNames = paste0("ControlBM", str_replace_all(TissueRGsetNames, "GEO_RGset", ""))
)
# write the function to join all control informations together
# value-- Beta or M
# TissueBMvalInfo -- a tibble containing BM names
JoinAndPlotAllControl <-
function(ControlBMvalInfo, value, GEO_phenotypes){
registerDoParallel(cores = 10)
# list
ListToJoin <- list()
# names for the list
listElements <- paste0(ControlBMvalInfo$ControlBMvalNames, ".df")
# fill the list with beta/M values
if(value=="Beta"){
for(Name in listElements){
TissueBMvalNames <- str_replace_all(Name, ".df", "")
ListToJoin[[Name]] <- get(TissueBMvalNames)$ControlB %>% as.data.frame() %>% rownames_to_column(var="Probe")
}
}else if(value=="M"){
for(Name in listElements){
TissueBMvalNames <- str_replace_all(Name, ".df", "")
ListToJoin[[Name]] <- get(TissueBMvalNames)$ControlM %>% as.data.frame() %>% rownames_to_column(var="Probe")
}
} # end if
# left_join values form all samples
Join.df <- ListToJoin %>% purrr::reduce(left_join, by = "Probe")
# omit na values
Join.df_v1 <- na.omit(Join.df)
# as matrix
Join.df_v2 <- Join.df_v1 %>% remove_rownames() %>%
column_to_rownames(var="Probe") %>% as.matrix() #
#plotMDS plots
## phenodata for these samples
GEO_phenotypes_551 <- GEO_phenotypes[match(colnames(Join.df_v2),
GEO_phenotypes$Sample_name),]
## check names all in same order or not
table(colnames(Join.df_v2)==GEO_phenotypes_551$Sample_name) %>% print()
## plot MDS plot
par(mfrow=c(1,1))
MDS_control <- plotMDS(Join.df_v2, top=nrow(Join.df_v2), labels= GEO_phenotypes_551$Sample, gene.selection="common")
# plot PCA
PCA_control <- FactoMineR::PCA(base::t(Join.df_v2))
# return Join.df_v2, GEO_phenotypes_norm552 and MDS_norm552
output_List <- list(JoinAllControl=Join.df_v2, phenoDataAll=GEO_phenotypes_551,
MDS_control=MDS_control, PCA_control=PCA_control)
return(output_List)
}
JoinAndPlotAllControlControl551_Beta <- JoinAndPlotAllControl(ControlBMvalInfo = ControlBMvalInfo,
value = "Beta",
GEO_phenotypes = GEO_phenotypes)
##
## TRUE
## 551
Control551_M <- JoinAndPlotAllControl(ControlBMvalInfo = ControlBMvalInfo,
value = "M",
GEO_phenotypes = GEO_phenotypes)
##
## TRUE
## 551
save(Control551_Beta, Control551_M,
file = "/home/qianhui/DNAme/Process_decidual/RDSfiles/Controls_BM_all551.RData")